This .Rmd file uses the Jack’s Car Rental example (Example 4.2 in the RL book) to illustrate dynamic programming (i.e., policy evaluation, policy improvement, and policy iteration). The description of the example is given below:
Example 4.2: Jack’s Car Rental Jack manages two locations for a nationwide car rental company. Each day, some number of customers arrive at each location to rent cars. If Jack has a car available, he rents it out and is credited \(\$ 10\) by the national company. If he is out of cars at that location, then the business is lost. Cars become available for renting the day after they are returned. To help ensure that cars are available where they are needed, Jack can move them between the two locations overnight, at a cost of \(\$ 2\) per car moved. We assume that the number of cars requested and returned at each location are Poisson random variables, where \(\lambda\) is the expected number. Suppose \(\lambda\) is 3 and 4 for rental requests at the first and second locations and 3 and 2 for returns. To simplify the problem slightly, we assume that there can be no more than 20 cars at each location (any additional cars are returned to the nationwide company, and thus disappear from the problem) and a maximum of 5 cars can be moved from one location to the other in one night. We take the discount rate to be \(\gamma=0.9\) and formulate this as a continuing finite MDP, where the time steps are days, the state is the number of cars at each location at the end of the day, and the actions are the net numbers of cars moved between the two locations overnight.
Let states \(s = (s_1, s_2)\) with \(s_1, s_2 \in \{0, 1, \dots, 20\}\). An action \(a \in \{0, \pm1, \pm2, \pm3, \pm4, \pm5\}\) is the net number of cars moved from location 1 to location 2.
We write the R function to compute the MDP transition probabilities \(P(S_{t+1}=s' \mid S_t=s, A_t=a)\) and expected rewards \(E(R_t \mid S_t=s, A_t=a)\) here. Note that action \(a\) should satisfy the following constraints: \[0 \le s_1 - a \le 20, \quad 0 \le s_2+a \le 20, \quad -5 \le a \le 5.\]
######### MDP function to compute p(s' | s, a) and E(r | s, a) ##########
MDP_prob <- function(s, a,
                     lambda_rent = c(3, 4), lambda_return = c(3, 2),
                     rent_credit = 10, move_cost = 2){
  ## Transition probabilities p(s' | s, a) and expected reward E(r | s, a)
  ## for Jack's Car Rental. `s = c(s1, s2)` is the end-of-day state and `a`
  ## the number of cars moved overnight from location 1 to location 2.
  ## The caller must ensure a is feasible:
  ##   max(s[1] - 20, -s[2], -5) <= a <= min(s[1], 20 - s[2], 5)
  ## Defaults reproduce Example 4.2; the rates/prices are parameters so the
  ## same code covers variants of the problem.
  ## Returns list(P_trans = 441-vector over s' (same order as
  ## expand.grid(0:20, 0:20)), R_expected = scalar).
  ##### morning state after moving cars #####
  s[1] <- s[1] - a
  s[2] <- s[2] + a
  ##### compute E(r | s, a) #####
  R1 <- -move_cost * abs(a)  # cost of moving cars
  pd1 <- prob_rent(s[1], lambda = lambda_rent[1])
  pd2 <- prob_rent(s[2], lambda = lambda_rent[2])
  p1 <- pd1$prob
  p2 <- pd2$prob
  ## Expected rental income. Because the two demands are independent and
  ## each prob vector sums to 1, the double sum of (j + k) * p1[j] * p2[k]
  ## reduces to E[rentals at 1] + E[rentals at 2] -- O(s1 + s2), not O(s1*s2).
  R2 <- rent_credit * (sum((0:s[1]) * p1) + sum((0:s[2]) * p2))
  R_expected <- R1 + R2
  ##### compute p(s' | s, a) #####
  p_end1 <- prob_end(s[1], lambda = lambda_return[1], p1)
  p_end2 <- prob_end(s[2], lambda = lambda_return[2], p2)
  ## The locations evolve independently: outer product of the two marginal
  ## end-of-day distributions, flattened with s1' varying fastest.
  P_trans <- c(tcrossprod(p_end1, p_end2))
  list(P_trans = P_trans, R_expected = R_expected)
}
########## probability of # of cars rented for one location ##########
prob_rent <- function(current, lambda){
  ## Distribution of the number of cars actually rented at one location that
  ## holds `current` cars, when demand is Poisson(lambda). Demand beyond the
  ## stock is truncated: any demand >= current rents exactly `current` cars.
  ## Returns a data.frame with columns rent (0:current) and prob.
  if (current == 0){
    p <- 1  # no cars on hand: zero rentals with probability 1
  } else {
    p <- dpois(x = 0:(current - 1), lambda = lambda)
    ## Tail mass P(demand >= current); ppois is numerically safer than
    ## 1 - sum(dpois(...)) and cannot go slightly negative.
    p <- c(p, ppois(current - 1, lambda = lambda, lower.tail = FALSE))
  }
  data.frame(rent = 0:current, prob = p)
}
########## probability of # of cars at night for one location ##########
prob_end <- function(current, lambda, p_rent, max_cars = 20){
  ## Distribution of the number of cars at one location at day's end:
  ## cars remaining after rentals plus Poisson(lambda) returns, capped at
  ## `max_cars` (extra cars go back to the nationwide company).
  ## `p_rent` is the rental distribution from prob_rent() for `current` cars.
  ## Returns a (max_cars + 1)-vector of probabilities over 0:max_cars.
  ## Note: the original had a dangling `p_rent <-` that R silently parsed as
  ## a chained assignment; replaced with an explicit rev().
  p_end <- rep(0, max_cars + 1)
  ## p_left[j + 1] = P(j cars remain after renting) = P(rent = current - j)
  p_left <- rev(p_rent)
  for (j in 0:current){  # j = number of cars left after renting
    if (j == max_cars){
      p <- 1  # already at capacity: returns cannot add more
    } else {
      p <- dpois(x = 0:(max_cars - 1 - j), lambda = lambda)
      ## returns that would exceed capacity pile up at max_cars
      p <- c(p, ppois(max_cars - 1 - j, lambda = lambda, lower.tail = FALSE))
    }
    idx <- j:max_cars
    p_end[idx + 1] <- p_end[idx + 1] + p * p_left[j + 1]
  }
  p_end
}
Set initial policy as \[\pi(a \mid s) = I(a=0). \]
## policy function (initial)
policy0 <- function(s, a){
  ## Initial deterministic policy: never move any cars,
  ## i.e. pi(a | s) = I(a == 0) for every state s.
  as.numeric(a == 0)
}
By Bellman’s equation, \[ v_{\pi}\left(s\right) = \sum_{a} \pi(a \mid s) \left\{E(r \mid s, a) + \gamma \sum_{s'} p(s' \mid s, a) v_{\pi}\left( s' \right) \right\}. \] We can find \(v_{\pi}\left(s\right)\) by solving the above linear system either directly or via dynamic programming. Here, we directly solve the linear system.
############# policy evaluation #############
policy_eva <- function(gamma, policy, ...){
  ## Evaluate a policy by solving the Bellman linear system directly:
  ##   V = R + gamma * H V  =>  V = (I - gamma * H)^{-1} R
  ## where R[j] = E(r | s_j) and H[j, ] = p(. | s_j) under `policy`.
  ## `policy(s, a, ...)` returns pi(a | s); extra args in `...` are
  ## forwarded to it (e.g. `dict` for policy_improve).
  ## Returns the value function as a 21 x 21 matrix indexed by (s1+1, s2+1).
  s_all <- expand.grid(0:20, 0:20)  # all 441 states; Var1 (= s1) varies fastest
  m <- nrow(s_all)
  R <- rep(0, m)
  H <- matrix(0, m, m)
  for (j in seq_len(m)){
    s <- as.numeric(s_all[j, ])  # hoisted: independent of the action
    ## feasible actions: 0 <= s1 - a <= 20, 0 <= s2 + a <= 20, |a| <= 5;
    ## a = 0 is always feasible, so lb <= ub
    lb <- max(s[1] - 20, -s[2], -5)
    ub <- min(s[1], 20 - s[2], 5)
    for (a in lb:ub){
      p_as <- policy(s, a, ...)
      mdp_out <- MDP_prob(s, a)
      R[j] <- R[j] + p_as * mdp_out$R_expected
      H[j, ] <- H[j, ] + p_as * mdp_out$P_trans
    }
  }
  V <- solve(diag(m) - gamma * H) %*% R
  matrix(V, 21, 21)
}
We can also compute the Q value (i.e. state-action value) by \[ Q(s,a) = E(r\mid s, a) + \gamma\sum_{s'} p\left(s^{\prime} \mid s, a\right) v_{k}\left(s^{\prime}\right). \] Don’t forget to set \(Q(s,a) = -\infty\) or any large negative value when \(a\) is not proper for \(s\).
Q_value <- function(gamma, V){
  ## State-action values: Q(s, a) = E(r | s, a) + gamma * sum_s' p(s'|s,a) V(s').
  ## `V` is the 441-vector of state values in the same order as
  ## expand.grid(0:20, 0:20). Returns a 441 x 11 matrix whose column a + 6
  ## holds action a in -5:5. Infeasible (s, a) pairs keep the large negative
  ## sentinel -1e4 so which.max never selects them.
  s_all <- expand.grid(0:20, 0:20)
  m <- nrow(s_all)
  a_all <- -5:5
  out <- matrix(-1e4, m, length(a_all))
  for (j in seq_len(m)){
    s <- as.numeric(s_all[j, ])
    ## feasible actions only (a = 0 is always feasible, so lb <= ub)
    lb <- max(s[1] - 20, -s[2], -5)
    ub <- min(s[1], 20 - s[2], 5)
    for (a in lb:ub){
      mdp_out <- MDP_prob(s, a)
      out[j, a + 6] <- mdp_out$R_expected +
        gamma * crossprod(mdp_out$P_trans, V)
    }
  }
  out
}
Then, we can give one-step update of (deterministic) policy by \[ a' = \mbox{argmax}_a Q(s,a) \]
Finally, we apply policy iteration by iteratively applying policy evaluation and policy improvement.
policy_iter <- function(gamma, policy0, tol=1e-3){
  ## Policy iteration: alternate policy evaluation and greedy policy
  ## improvement until the value function changes by less than `tol`
  ## (sup norm). Plots the current policy at each iteration.
  ## Returns list(iter = number of iterations, V_mat = 21 x 21 value
  ## matrix, policy = 21 x 21 matrix of greedy actions).
  V_mat0 <- policy_eva(gamma = gamma, policy = policy0)  # evaluate initial policy
  V0 <- c(V_mat0)
  Q0 <- Q_value(gamma, V0)
  dict0 <- apply(Q0, 1, which.max) - 6  # greedy action per state, in -5:5
  ii <- 0
  repeat {  # `repeat` is the idiomatic R infinite loop (was while(1))
    ii <- ii + 1
    ## evaluate the improved deterministic policy encoded by dict0
    V_mat1 <- policy_eva(gamma = gamma, policy = policy_improve, dict = dict0)
    V1 <- c(V_mat1)
    policy_mat0 <- matrix(dict0, 21, 21)
    print(plot_policy(dict0, title = paste0("Iteration = ", ii)))
    if (max(abs(V1 - V0)) < tol){  # values stabilized => policy is optimal
      break
    }
    V0 <- V1
    Q0 <- Q_value(gamma, V0)
    dict0 <- apply(Q0, 1, which.max) - 6  # next greedy improvement
  }
  list(iter = ii,
       V_mat = V_mat1,
       policy = policy_mat0)
}
policy_improve <- function(s, a, dict){
  ## Deterministic policy encoded by `dict`: pi(a | s) = I(a == dict[k]),
  ## where k is the row of state s in expand.grid(0:20, 0:20).
  ## In that grid Var1 (= s1) varies fastest, so the row index is computed
  ## directly instead of rebuilding the 441-row grid on every call.
  k <- s[1] + 21 * s[2] + 1
  as.numeric(a == dict[k])
}
plot_policy <- function(dict, title, if_policy=TRUE){
  ## Heat map of a per-state quantity over the 21 x 21 state grid.
  ## `dict` must be in the same order as expand.grid(0:20, 0:20).
  ## With if_policy = TRUE, use a diverging blue/white/red fill centered
  ## at 0 (the "move no cars" action).
  grid_df <- as.data.frame(expand.grid(0:20, 0:20))
  names(grid_df) <- c("s1", "s2")
  grid_df$Actions <- dict
  fig <- ggplot(data = grid_df) +
    geom_raster(aes(x = s1, y = s2, fill = Actions)) +
    xlab("# of Cars at Location 1") +
    ylab("# of Cars at Location 2") +
    ggtitle(title) +
    theme_bw(base_size = 14)
  if (if_policy){
    fig <- fig +
      scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                           midpoint = 0)
  }
  fig
}
Visualize policy at each policy iteration. The last policy is the optimal policy.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
## Run policy iteration from the all-zero initial policy with gamma = 0.9;
## each iteration's policy is plotted as a side effect.
o <- policy_iter(gamma=0.9, policy0=policy0)
Visualize the value function under the optimal policy.
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## 3-D surface of the optimal value function over the 21 x 21 state grid.
plot_ly(type = 'surface', x=~0:20, y=~0:20, z=~o$V_mat)